Attempt to determine the top skills for a data scientist based on job listings for data scientists. Different text mining techniques are used to identify skills from the description section of the job listings.
# Libraries used
library(dplyr)
library(ggplot2)
library(ggraph)
library(igraph)
library(quanteda)
library(tidyr)
library(tidytext)
library(tidyverse)
library(topicmodels)
Following the NASA case study example presented at https://www.tidytextmining.com/nasa.html for Term Frequency and Topic Modeling.
# Read in the CSV of the 10,000 job listings for Data Scientist.
# stringsAsFactors = FALSE keeps the text columns as character vectors.
jobs_df <- read.csv(file = 'data_scientist_united_states_job_postings_jobspikr.csv', stringsAsFactors = FALSE)
# For testing purposes, keep just the first 2,000 rows.
# head() is safe if the file has fewer than 2,000 rows, whereas
# jobs_df[1:2000, ] would pad the result with NA-filled rows.
jobs_df <- head(jobs_df, 2000)
Job description is the most important field in the dataset for the data mining exercise. The field contains the complete write-up posted for the job listing.
# Build a tidy two-column table: one row per listing, keyed by the
# listing's unique id, carrying the full job-description text.
jobs_desc <- tibble(
  id   = jobs_df$uniq_id,
  desc = jobs_df$job_description
)
# Peek at five randomly chosen descriptions.
jobs_desc %>%
  select(desc) %>%
  sample_n(5)
## # A tibble: 5 x 1
## desc
## <chr>
## 1 "Are you interested in expanding your career through experience and expo…
## 2 RESPONSIBILITIES: Kforce has a client seeking a Lead Data Scientist in C…
## 3 "Read what people are saying about working here. \n\nJob Type:\n\nReq ID…
## 4 "Read what people are saying about working here. Job Title\n\nHealthcare…
## 5 "Read what people are saying about working here. \n\nPosition Descriptio…
The job category provides some context of the job listing and will be used to capture important words per category.
# One row per listing: unique id plus the listing's job category.
jobs_cat <- tibble(id = jobs_df$uniq_id,
                   category = jobs_df$category)
# Drop listings with a blank category. Refer to the column directly
# rather than writing jobs_cat$category inside filter(): referencing
# the data frame by name is fragile (it ignores grouping and breaks
# if the pipeline input changes). print() returns its input invisibly,
# so the assignment still captures the filtered table.
jobs_cat <- jobs_cat %>% filter(category != "") %>% print(n = 10)
## # A tibble: 1,643 x 2
## id category
## <chr> <chr>
## 1 3b6c6acfcba6135a31c83bd7ea493b18 Accounting/Finance
## 2 1c8541cd2c2c924f9391c7d3f526f64e Accounting/Finance
## 3 445652a560a5441060857853cf267470 biotech
## 4 9571ec617ba209fd9a4f842973a4e9c8 Accounting/Finance
## 5 0ec629c03f3e82651711f2626c23cadb Accounting/Finance
## 6 80d64b46bc7c89602f63daf06b9f1b4c Accounting/Finance
## 7 9cd3ed78e5cac9e516ea41173de2c25f Computer/Internet
## 8 53224da901548e7137bbb163d456ba6a Computer/Internet
## 9 010b5d671896d26eba50948c7c337f94 Computer/Internet
## 10 4a8b875d1acc4f560716561d699aa022 Computer/Internet
## # … with 1,633 more rows
From the job_description, tokenize all the words and remove “stop_words” which are common words in the English language to allow for focus on meaningful words of the job listing.
# Use tidytext's unnest_tokens() on the description field so we can do the text analysis.
# unnest_tokens() tokenizes all the words in the description field and creates a tidy
# data frame of one word per row, keyed by the listing identifier.
# Note: unnest_tokens() lowercases tokens by default (to_lower = TRUE).
# anti_join(stop_words) then removes common English words so only
# meaningful terms remain.
jobs_desc <- jobs_desc %>%
unnest_tokens(word, desc) %>%
anti_join(stop_words)
## Joining, by = "word"
jobs_desc
## # A tibble: 610,000 x 2
## id word
## <chr> <chr>
## 1 3b6c6acfcba6135a31c83bd7ea493b18 read
## 2 3b6c6acfcba6135a31c83bd7ea493b18 people
## 3 3b6c6acfcba6135a31c83bd7ea493b18 farmers
## 4 3b6c6acfcba6135a31c83bd7ea493b18 join
## 5 3b6c6acfcba6135a31c83bd7ea493b18 team
## 6 3b6c6acfcba6135a31c83bd7ea493b18 diverse
## 7 3b6c6acfcba6135a31c83bd7ea493b18 professionals
## 8 3b6c6acfcba6135a31c83bd7ea493b18 farmers
## 9 3b6c6acfcba6135a31c83bd7ea493b18 acquire
## 10 3b6c6acfcba6135a31c83bd7ea493b18 skills
## # … with 609,990 more rows
Provide count in table form of the most common words in the job descriptions.
# Tabulate the most frequent words across all job descriptions and
# show the top ten.
count(jobs_desc, word, sort = TRUE) %>%
  print(n = 10)
## # A tibble: 16,753 x 2
## word n
## <chr> <int>
## 1 data 23515
## 2 experience 10467
## 3 business 5479
## 4 learning 4599
## 5 science 4423
## 6 analytics 3999
## 7 analysis 3681
## 8 team 3651
## 9 skills 3648
## 10 machine 3585
## # … with 1.674e+04 more rows
Applying lowercase to all the words to ensure different cases of the same word aren’t considered different.
# lowercase all the words just to make sure there's no redundancy
# NOTE(review): unnest_tokens() already lowercases by default
# (to_lower = TRUE), so this is likely a no-op kept as a safeguard.
jobs_desc <- jobs_desc %>%
mutate(word = tolower(word))
The term frequency times inverse document frequency (TF-IDF) is used to identify words that are especially important to a document within a collection of documents. The results are the most important words in the description fields as measured by TF-IDF, meaning the words are common but not too common.
# Compute tf-idf for each (document, word) pair: words that are
# frequent within a listing but rare across listings score highest.
desc_tf_idf <- jobs_desc %>%
  count(id, word, sort = TRUE) %>%
  ungroup() %>%
  bind_tf_idf(word, id, n)
# Show the highest-scoring words, restricted to words appearing at
# least ten times in their document.
desc_tf_idf %>%
  filter(n >= 10) %>%
  arrange(desc(tf_idf))
## # A tibble: 2,677 x 6
## id word n tf idf tf_idf
## <chr> <chr> <int> <dbl> <dbl> <dbl>
## 1 3060feb695b8e87923f8a58cf24fef40 nbsp 21 0.0868 5.11 0.444
## 2 5864a13f0123fbe568f87b25d5fd74df bull 21 0.0595 6.91 0.411
## 3 948b573aa265210fa845a2309201a079 beacon 13 0.0526 6.91 0.364
## 4 5864a13f0123fbe568f87b25d5fd74df nbsp 25 0.0708 5.11 0.362
## 5 948b573aa265210fa845a2309201a079 hill 13 0.0526 6.50 0.342
## 6 0ec629c03f3e82651711f2626c23cadb licensing 14 0.0601 5.65 0.340
## 7 919e0dff20e8ef9c673a2438861c3a24 munich 11 0.044 7.60 0.334
## 8 754bd14947b7e23f004405174c46622e scripps 12 0.0432 7.60 0.328
## 9 3fc6d7717e5b98369a49ecf9bd39d85b biztalk 13 0.0451 6.91 0.312
## 10 64968dba1885b3593bc60d21c6dd7b3f biztalk 13 0.0451 6.91 0.312
## # … with 2,667 more rows
The join is performed on the unique ID as key. Joining with the categories will identify the most important words from the job descriptions per job category.
# Join with the category table on the unique id. full_join keeps
# listings with no category (category becomes NA); the category
# filter below then restricts to known categories.
desc_tf_idf <- full_join(desc_tf_idf, jobs_cat, by = "id")
# Plot the top tf-idf words per category.
desc_tf_idf %>%
# Drop degenerate rows where tf == 1 (the word makes up the entire
# document, so tf-idf is uninformative there).
filter(!near(tf, 1)) %>%
# Keep only rows with a known category (drops NA rows introduced by
# the full join).
filter(category %in% jobs_cat$category) %>%
arrange(desc(tf_idf)) %>%
group_by(category) %>%
# One row per (word, category), keeping the highest-tf-idf occurrence.
distinct(word, category, .keep_all = TRUE) %>%
# Top 8 words per category (ties may yield a few more).
top_n(8, tf_idf) %>%
ungroup() %>%
# Freeze factor levels so bars plot in tf-idf order.
mutate(word = factor(word, levels = rev(unique(word)))) %>%
ggplot(aes(word, tf_idf, fill = category)) +
geom_col(show.legend = FALSE) +
facet_wrap(~category, ncol = 3, scales = "free") +
coord_flip() +
labs(title = "Highest tf-idf words in job listing description fields",
caption = "From jobpickr dataset",
x = NULL, y = "tf-idf")
The resulting plot did not prove useful for identifying skills across all the job listings for data scientist. The plot does indicate which words are more common across that specific job category. The results demonstrate that job listings by category are likely posted by the same company or same recruiter and thus the same boilerplate description is often used across many job listings.
In order to perform topic modeling, a document-term matrix is required.
# 8.4 Topic Modeling
# Per-document word counts, the raw material for a document-term matrix.
word_counts <- ungroup(count(jobs_desc, id, word, sort = TRUE))
print(word_counts, n = 10)
## # A tibble: 381,507 x 3
## id word n
## <chr> <chr> <int>
## 1 3d8f0398cc54891ea8882148c5ccbc94 data 58
## 2 049784c976f14a08fd6ad657112ee142 data 56
## 3 97e787bb231a3947d4535d7caccc8a2b data 54
## 4 ad142ec84fe65c670d841b2209932020 data 53
## 5 65be41bc1431a3d7be4481500e43966c data 52
## 6 e5141eb0203ff1f338de433faa14581a based 52
## 7 e5141eb0203ff1f338de433faa14581a residence 52
## 8 2e3f084a7c9189ae5656a4e253e1878f data 51
## 9 38e1881e6ae9ed06bced92e784ce2216 data 50
## 10 5deab449ec2f808c4947a009aa1db19a data 50
## # … with 3.815e+05 more rows
The resulting document-term matrix indicates a high level of sparsity. The non-zero entries do correspond to a certain word appearing in a particular document.
# Cast the tidy counts into a tm-style DocumentTermMatrix
# (documents = listing ids, terms = words, values = counts).
desc_dtm <- cast_dtm(word_counts, id, word, n)
desc_dtm
## <<DocumentTermMatrix (documents: 1998, terms: 16753)>>
## Non-/sparse entries: 381507/33090987
## Sparsity : 99%
## Maximal term length: 60
## Weighting : term frequency (tf)
According to Wikipedia, “In natural language processing, the latent Dirichlet allocation (LDA) is a generative statistical model that allows sets of observations to be explained by unobserved groups that explain why some parts of the data are similar.”
# Running this model is time intensive.
# Fit an LDA model with 16 topics; the fixed seed makes the
# stochastic fit reproducible across runs.
desc_lda <- LDA(desc_dtm, k = 16, control = list(seed = 1234))
desc_lda
## A LDA_VEM topic model with 16 topics.
# Tidy the fitted model into one row per (topic, term) carrying beta,
# the per-topic word probability ("beta" is the tidier's default).
tidy_lda <- tidy(desc_lda, matrix = "beta")
tidy_lda
## # A tibble: 268,048 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 data 0.0314
## 2 2 data 0.0274
## 3 3 data 0.0167
## 4 4 data 0.0530
## 5 5 data 0.0648
## 6 6 data 0.0458
## 7 7 data 0.0379
## 8 8 data 0.0287
## 9 9 data 0.0475
## 10 10 data 0.0270
## # … with 268,038 more rows
# For each topic, keep its ten highest-probability terms, ordered by
# topic and then by descending beta.
top_terms <- tidy_lda %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, desc(beta))
top_terms
## # A tibble: 160 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 data 0.0314
## 2 1 scientist 0.0177
## 3 1 experience 0.0151
## 4 1 job 0.0130
## 5 1 skills 0.00973
## 6 1 location 0.00902
## 7 1 python 0.00789
## 8 1 healthcare 0.00741
## 9 1 strong 0.00595
## 10 1 ca 0.00581
## # … with 150 more rows
Even though the topics are anonymous, only identified by number, the groupings of words show some similarities and differences, but do not necessarily provide much value at this point.
The topic modeling process has identified groupings of terms that we can understand as human readers of these description fields.
# Bar chart of each topic's top terms, one facet per topic.
top_terms %>%
# reorder_within() + scale_x_reordered() sort terms by beta
# independently inside each facet.
mutate(term = reorder_within(term, beta, topic)) %>%
group_by(topic, term) %>%
arrange(desc(beta)) %>%
ungroup() %>%
ggplot(aes(term, beta, fill = as.factor(topic))) +
geom_col(show.legend = FALSE) +
coord_flip() +
scale_x_reordered() +
labs(title = "Top 10 terms in each LDA topic",
x = NULL, y = expression(beta)) +
facet_wrap(~ topic, ncol = 4, scales = "free")
Gamma will define the probability that each document belongs in each topic.
# Extract gamma: the estimated probability that each document
# belongs to each topic.
lda_gamma <- desc_lda %>% tidy(matrix = "gamma")
lda_gamma
## # A tibble: 31,968 x 3
## document topic gamma
## <chr> <int> <dbl>
## 1 3d8f0398cc54891ea8882148c5ccbc94 1 0.0000274
## 2 049784c976f14a08fd6ad657112ee142 1 0.0000340
## 3 97e787bb231a3947d4535d7caccc8a2b 1 0.0000275
## 4 ad142ec84fe65c670d841b2209932020 1 0.0000605
## 5 65be41bc1431a3d7be4481500e43966c 1 0.0000313
## 6 e5141eb0203ff1f338de433faa14581a 1 1.000
## 7 2e3f084a7c9189ae5656a4e253e1878f 1 0.0767
## 8 38e1881e6ae9ed06bced92e784ce2216 1 0.0000264
## 9 5deab449ec2f808c4947a009aa1db19a 1 0.0000467
## 10 ea38905f0f2700d82861bb12d384b378 1 0.0000382
## # … with 31,958 more rows
# Attach the job category to each document-topic row; documents
# without a category get NA (the full join keeps both sides).
lda_gamma <- lda_gamma %>%
  full_join(jobs_cat, by = c("document" = "id"))
lda_gamma
## # A tibble: 32,000 x 4
## document topic gamma category
## <chr> <int> <dbl> <chr>
## 1 3d8f0398cc54891ea8882148c5cc… 1 0.0000274 Arts/Entertainment/Publis…
## 2 049784c976f14a08fd6ad657112e… 1 0.0000340 Sales
## 3 97e787bb231a3947d4535d7caccc… 1 0.0000275 Computer/Internet
## 4 ad142ec84fe65c670d841b220993… 1 0.0000605 Insurance
## 5 65be41bc1431a3d7be4481500e43… 1 0.0000313 Manufacturing/Mechanical
## 6 e5141eb0203ff1f338de433faa14… 1 1.000 Manufacturing/Mechanical
## 7 2e3f084a7c9189ae5656a4e253e1… 1 0.0767 math jobs
## 8 38e1881e6ae9ed06bced92e784ce… 1 0.0000264 government
## 9 5deab449ec2f808c4947a009aa1d… 1 0.0000467 Computer/Internet
## 10 ea38905f0f2700d82861bb12d384… 1 0.0000382 Manufacturing/Mechanical
## # … with 31,990 more rows
# Assign each document to the topic it most likely belongs to
# (gamma > 0.5), count documents per (topic, category), and drop
# rows with no category.
top_cats <- lda_gamma %>%
  filter(gamma > 0.5) %>%
  count(topic, category, sort = TRUE) %>%
  filter(!is.na(category))
Topic 9 identifies ‘business and financial operations’ as the top category, and is the only topic to include the term ‘aws’. Topic 4, most identified with category ‘Arts/Entertainment/Publishing’, contains the terms ‘experience’ and ‘content’, which align with the category broadly.
# One more graph from 8.4.4: for each topic, the five categories with
# the most documents assigned to that topic.
top_cats %>%
  group_by(topic) %>%
  top_n(5, n) %>%
  ungroup() %>%
  mutate(category = reorder_within(category, n, topic)) %>%
  ggplot(aes(category, n, fill = as.factor(topic))) +
  geom_col(show.legend = FALSE) +
  labs(title = "Top categories for each LDA topic",
       x = NULL, y = "Number of documents") +
  coord_flip() +
  scale_x_reordered() +
  facet_wrap(~ topic, ncol = 2, scales = "free")
Following the example at https://www.r-bloggers.com/advancing-text-mining-with-r-and-quanteda/ and https://quanteda.io/articles/pkgdown/examples/plotting.html
# Generate a quanteda corpus, one document per listing; de-duplicate
# ids first so each document id is unique.
uniq_jobs_df <- distinct(jobs_df, uniq_id, .keep_all = TRUE)
my_corpus <- corpus(uniq_jobs_df,
                    docid_field = "uniq_id",
                    text_field = "job_description")
mycorpus_stats <- summary(my_corpus)
Remove numbers, remove punctuation, remove symbols, remove URLs, split hyphens. Clean for OCR.
# Preprocess the text
# Create tokens: strip numbers, punctuation, symbols and URLs, and
# split hyphenated words into separate tokens.
token <-
tokens(
my_corpus,
remove_numbers = TRUE,
remove_punct = TRUE,
remove_symbols = TRUE,
remove_url = TRUE,
split_hyphens = TRUE
)
# Clean residual tokens (e.g. OCR artifacts) by regex:
#   [\\d-]       tokens containing a digit or hyphen
#   [[:punct:]]  tokens containing punctuation
#   ^.{1,2}$     tokens of only one or two characters
token_ungd <- tokens_select(
token,
c("[\\d-]", "[[:punct:]]", "^.{1,2}$"),
selection = "remove",
valuetype = "regex",
verbose = TRUE
)
## removed 1,737 features
Using the Quanteda library, create the document-feature matrix and filter out words that appear in fewer than 7.5% or more than 90% of documents.
# Document-feature matrix: lowercase, stem, and drop English stopwords.
# NOTE(review): passing tolower/stem/remove to dfm() is the older
# quanteda calling style; newer quanteda versions move these steps to
# tokens_*()/dfm_remove() -- confirm the installed version if this
# stops working.
my_dfm <- dfm(token_ungd,
tolower = TRUE,
stem = TRUE,
remove = stopwords("english")
)
# Keep only features appearing in between 7.5% and 90% of documents
# (docfreq_type = "prop" interprets the bounds as proportions).
my_dfm_trim <-
dfm_trim(
my_dfm,
min_docfreq = 0.075,
# min 7.5%
max_docfreq = 0.90,
# max 90%
docfreq_type = "prop"
)
# Peek at the top 10 documents x 10 features after sorting both margins
# by frequency.
head(dfm_sort(my_dfm_trim, decreasing = TRUE, margin = "both"),
n = 10,
nf = 10)
## Document-feature matrix of: 10 documents, 10 features (4.0% sparse) and 20 docvars.
## features
## docs work analyt busi develop model learn
## 0f4fc982e72ca26aaad1f66637019652 15 10 6 10 6 6
## 9f2a8aee7e7a102720cfb3dbc39c6c24 11 2 0 28 4 18
## 6737d173f0bdf3d4d8fd3543518e40c7 11 2 0 30 4 18
## 3fcf1d237cf41524a247f9b940393b95 20 40 22 18 18 4
## 1ed541e018a0ff4788fcb7adbece18b2 15 18 24 16 24 8
## 7aa14f095302ee73feaa651c7430a0ad 10 9 2 16 6 6
## features
## docs team scienc statist requir
## 0f4fc982e72ca26aaad1f66637019652 12 12 10 18
## 9f2a8aee7e7a102720cfb3dbc39c6c24 18 10 10 16
## 6737d173f0bdf3d4d8fd3543518e40c7 18 10 10 16
## 3fcf1d237cf41524a247f9b940393b95 18 0 12 4
## 1ed541e018a0ff4788fcb7adbece18b2 14 4 10 12
## 7aa14f095302ee73feaa651c7430a0ad 1 9 5 30
## [ reached max_ndoc ... 4 more documents ]
Plot shows the occurrences of the terms ‘python’ and ‘r’ across all documents for the state of Oregon (OR). The state was chosen to give a natural subset of all documents initially included.
The lexical dispersion appears to indicate the use of the terms ‘python’ and ‘r’ often occur in conjunction in the documents (job descriptions) which would indicate the listings are listing the two programming languages in or near the same sentence.
# Restrict to Oregon listings to keep the x-ray plot readable.
my_corpus_sub <- corpus_subset(my_corpus, state == "OR")
# NOTE: theme_set() changes the ggplot2 theme globally for the session.
theme_set(theme_bw())
# Lexical dispersion ("x-ray") plot of where 'python' and 'r' occur
# within each document.
g <- textplot_xray(
kwic(my_corpus_sub, pattern = "python"),
kwic(my_corpus_sub, pattern = "r")
)
# Color by keyword; values map to keyword levels in order
# (presumably alphabetical: blue = python, red = r -- verify).
g + aes(color = keyword) +
scale_color_manual(values = c("blue", "red")) +
theme(legend.position = "none")
From the tutorial at https://www.tidytextmining.com/. The bigrams identify the word pairs that occur the most frequently
This exercise finds bigrams of two words. The function does allow for bigrams of greater than two.
# Tokenize the descriptions into two-word sequences (bigrams) and
# tally the most frequent pairs.
jobs_bigrams <- unnest_tokens(jobs_df, bigram, job_description,
                              token = "ngrams", n = 2)
count(jobs_bigrams, bigram, sort = TRUE)
## # A tibble: 178,989 x 2
## bigram n
## <chr> <int>
## 1 machine learning 3501
## 2 data scientist 3193
## 3 of the 2970
## 4 data science 2938
## 5 experience with 2794
## 6 in the 2598
## 7 ability to 2555
## 8 experience in 2035
## 9 in a 1928
## 10 years of 1768
## # … with 178,979 more rows
# Split each bigram into its two words so stop words can be removed
# from either position.
bigrams_separated <- jobs_bigrams %>%
  separate(bigram, c("word1", "word2"), sep = " ")
# Keep only bigrams where neither word is a stop word.
bigrams_filtered <- bigrams_separated %>%
  filter(!word1 %in% stop_words$word,
         !word2 %in% stop_words$word)
# New bigram counts -- this result is valuable.
bigram_counts <- bigrams_filtered %>%
  count(word1, word2, sort = TRUE)
bigram_counts %>% print(n = 20)
## # A tibble: 93,532 x 3
## word1 word2 n
## <chr> <chr> <int>
## 1 machine learning 3501
## 2 data scientist 3193
## 3 data science 2938
## 4 computer science 1033
## 5 data sets 950
## 6 data analysis 925
## 7 equal opportunity 860
## 8 data mining 823
## 9 national origin 710
## 10 data scientists 662
## 11 veteran status 658
## 12 communication skills 627
## 13 sexual orientation 623
## 14 opportunity employer 621
## 15 data analytics 594
## 16 data sources 593
## 17 color religion 561
## 18 gender identity 553
## 19 race color 545
## 20 qualified applicants 536
## # … with 9.351e+04 more rows
Include bigrams that occurred more than 250 times in order to filter out visual noise.
# Keep only bigrams seen more than 250 times and turn the word pairs
# into a directed igraph (word1 -> word2, with edge attribute n).
bigram_graph <- graph_from_data_frame(filter(bigram_counts, n > 250))
bigram_graph
## IGRAPH 418ed33 DN-- 73 53 --
## + attr: name (v/c), n (e/n)
## + edges from 418ed33 (vertex names):
## [1] machine ->learning data ->scientist
## [3] data ->science computer ->science
## [5] data ->sets data ->analysis
## [7] equal ->opportunity data ->mining
## [9] national ->origin data ->scientists
## [11] veteran ->status communication->skills
## [13] sexual ->orientation opportunity ->employer
## [15] data ->analytics data ->sources
## + ... omitted several edges
# Seed before plotting: the Fruchterman-Reingold ("fr") layout is
# stochastic, so the seed makes node placement reproducible.
set.seed(2020)
ggraph(bigram_graph, layout = "fr") +
geom_edge_link() +
geom_node_point() +
geom_node_text(aes(label = name), vjust = 1, hjust = 1)
# A fancier version of the same network: directed arrows, with edge
# opacity (edge_alpha) proportional to the bigram count.
set.seed(2021)
# Closed arrowheads; end_cap stops the arrow short of the node circle.
a <- grid::arrow(type = "closed", length = unit(.15, "inches"))
ggraph(bigram_graph, layout = "fr") +
geom_edge_link(aes(edge_alpha = n), show.legend = FALSE,
arrow = a, end_cap = circle(.07, 'inches')) +
geom_node_point(color = "lightblue", size = 5) +
geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
theme_void()
Simple bar plot of the most common words across all the job listings’ descriptions.
# Tokenize descriptions, drop stop words, and count words per listing.
jobs_words <- jobs_df %>%
unnest_tokens(word, job_description) %>%
anti_join(stop_words) %>%
count(uniq_id, word, sort = TRUE)
## Joining, by = "word"
# Total token count per listing, joined back on for later use.
total_words <- jobs_words %>%
group_by(uniq_id) %>%
summarize(total = sum(n))
jobs_words <- left_join(jobs_words, total_words)
## Joining, by = "uniq_id"
# NOTE(review): this second anti_join(stop_words) is redundant --
# stop words were already removed above, so it is a no-op here.
jobs_words <- jobs_words %>%
anti_join(stop_words)
## Joining, by = "word"
# jobs_words has one row per (listing, word), so count(word) here
# gives the number of listings each word appears in.
jobs_words %>%
count(word, sort = TRUE)
## # A tibble: 16,753 x 2
## word n
## <chr> <int>
## 1 data 1971
## 2 experience 1836
## 3 python 1541
## 4 scientist 1523
## 5 learning 1464
## 6 science 1445
## 7 team 1410
## 8 skills 1398
## 9 machine 1371
## 10 business 1353
## # … with 16,743 more rows
# Horizontal bar chart of words appearing in more than 500 listings,
# ordered by frequency.
jobs_words %>%
  count(word, sort = TRUE) %>%
  filter(n > 500) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_col() +
  labs(x = NULL) +
  coord_flip()